#!/usr/bin/perl -w
use warnings;
use strict;
use Carp qw(cluck :DEFAULT);
use Cwd qw||;
use Data::Dumper;
use File::Copy;
use File::Temp;
use File::Basename;
use Getopt::Long;
use Pod::Usage;
use Config::IniFiles;
use Bio::Seq;
use Bio::SeqIO;
use RG::PP::ACL;

$Carp::Verbose = 1; 

# Result file extension => name of the method the file is attributed to.
# The table is consulted when result files are handed to ppc_store and is
# printed as is by --print-ext-method-map.
our %meth_ext_hash = (
  'asp'                 => 'profasp',
  'blastPsiAli'         => 'blastpgp',
  'blastPsiAli.gz'      => 'blastpgp',
  'blastPsiMat'         => 'blastpgp',
  'blastPsiOutTmp'      => 'blastpgp',
  'blastPsiRdb'         => 'blastpgp',
  'blastPsi80Rdb'       => 'blastpgp',
  'blastpSwissM8'       => 'blastpswiss',
  'chk'                 => 'blastpgp',
  'clustalngz'          => 'psic',
  'coils'               => 'coiledcoils',
  'coils_raw'           => 'coiledcoils',
  'consurf.grades'      => 'consurf',
  'consurf.html'        => 'consurf',
  'disis'               => 'profdisis',
  'disulfinder'         => 'disulfinder',
  'fasta'               => 'input',
  'globe'               => 'profglobe',
  'hmm2pfam'            => 'hmmer',
  'hmm3pfam'            => 'hmmer',
  'hmm3pfamTbl'         => 'hmmer',
  'hmm3pfamDomTbl'      => 'hmmer',
  'hssp'                => 'blastpgp',
  'hssp80'              => 'blastpgp',
  'hsspPsiFil'          => 'hssp_filter',
  'hsspPsiFil.gz'       => 'hssp_filter',
  'in'                  => 'input',
  'isis'                => 'profisis',
  'loctreeAnimal'       => 'loctree',
  'loctreeAnimalTxt'    => 'loctree',
  'loctreePlant'        => 'loctree',
  'loctreePlantTxt'     => 'loctree',
  'loctreeProka'        => 'loctree',
  'loctreeProkaTxt'     => 'loctree',
  'arch.lc2'            => 'loctree2',
  'bact.lc2'            => 'loctree2',
  'euka.lc2'            => 'loctree2',
  'arch.lc3'            => 'loctree3',
  'arch.lc3.pb'         => 'loctree3',
  'arch.lc3.svm'        => 'loctree3',
  'bact.lc3'            => 'loctree3',
  'bact.lc3.pb'         => 'loctree3',
  'bact.lc3.svm'        => 'loctree3',
  'euka.lc3'            => 'loctree3',
  'euka.lc3.pb'         => 'loctree3',
  'euka.lc3.svm'        => 'loctree3',
  'mdisorder'           => 'metadisorder',
  'metastudent.BPO.txt' => 'metastudent',
  'metastudent.CCO.txt' => 'metastudent',
  'metastudent.MFO.txt' => 'metastudent',
  'nls'                 => 'predictnls',
  'nlsDat'              => 'predictnls',
  'nlsSum'              => 'predictnls',
  'norsnet'             => 'norsnet',
  'nors'                => 'nors',
  'phdNotHtm'           => 'phd',
  'phdPred'             => 'phd',
  'phdRdb'              => 'phd',
  'profAscii'           => 'prof',
  'prona'               => 'prona',
  'reprof'              => 'reprof',
  'profbval'            => 'profbval',
  'profb4snap'          => 'profbval',
  'prof1Rdb'            => 'prof',
  'profRdb'             => 'prof',
  'proftmb'             => 'proftmb',
  'proftmbdat'          => 'proftmb',
  'prosite'             => 'prosite',
  'psic'                => 'psic',
  'safBlastPsi'         => 'blastpgp',
  'safBlastPsi80'       => 'blastpgp',
  'segNormGCG'          => 'ncbi-seg',
  'segNorm'             => 'ncbi-seg',
  'seqGCG'              => 'input',
  'sumNors'             => 'nors',
  'tmhmm'               => 'tmhmm',
  'tmseg'               => 'tmseg',
  'somena'              => 'somena',
);

# SYSTEM CONFIGURATION RETRIEVAL
# Configuration files are layered with -import, each level overriding the
# previous one:
#   1. __pkgdatadir__/predictproteinrc.default
#   2. __sysconfdir__/predictproteinrc
#   3. the file named by $PREDICTPROTEINCONF if it exists, otherwise
#      ~/.predictproteinrc if that exists
# $config remains undefined when none of these files exists.
our $config;
BEGIN {
    our $VERSION = "__PACKAGE_VERSION__";
    # drop shell-sensitive environment variables and pin PATH
    delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
    $ENV{PATH} = "/bin:/usr/bin";
    my ( $defaultconfig, $etcconfig );
    if( -e "__pkgdatadir__/predictproteinrc.default" ) { $defaultconfig = Config::IniFiles->new( -file => "__pkgdatadir__/predictproteinrc.default" ); }
    if( -e "__sysconfdir__/predictproteinrc" ) { $etcconfig = Config::IniFiles->new( -file => "__sysconfdir__/predictproteinrc", -import => $defaultconfig ); } else { $etcconfig = $defaultconfig; }

    # Decide on the user level file by what is actually present. Testing the
    # two candidates separately keeps a PREDICTPROTEINCONF that points to a
    # missing file from being opened just because ~/.predictproteinrc exists.
    my $userconfig_file;
    if( $ENV{PREDICTPROTEINCONF} && -e $ENV{PREDICTPROTEINCONF} ){ $userconfig_file = $ENV{PREDICTPROTEINCONF}; }
    elsif( defined( $ENV{HOME} ) && -e "$ENV{HOME}/.predictproteinrc" ){ $userconfig_file = "$ENV{HOME}/.predictproteinrc"; }

    if( defined( $userconfig_file ) ) { $config = Config::IniFiles->new( -file => $userconfig_file, -import => $etcconfig ); } else { $config = $etcconfig; }
}

# popularity contest: call pp_popcon_cnt; when it could not be started at all
# (system() returns -1) point the user to the package that provides it
{
  my $popcon_ret = system( 'pp_popcon_cnt', '-p', 'predictprotein' );
  if( $popcon_ret == -1 )
  {
    warn("The Rost Lab recommends you install the pp-popularity-contest package that provides pp_popcon_cnt:\n\nsudo apt-get install pp-popularity-contest\n");
  }
}

# NOTE: all configuration can be overridden by command line options
# Path-like values go through glob() (e.g. for `~' expansion); glob() is used
# in scalar context so only its first result is taken.
if( !defined( $config ) ){ die("Error: no predictprotein configuration file could be loaded\n"); }

# predictprotein_utildir as configured; undefined when the option is not set
my $pp_utildir = glob($config->val('predictprotein', 'predictprotein_utildir'));
my $pp_rootdir  = ( defined( $pp_utildir ) ? $pp_utildir : '' ).'/';
#
my $bigblastdb = glob( $config->val('predictprotein', 'bigblastdb' ) );
my $big80blastdb = glob( $config->val('predictprotein', 'big80blastdb' ) );
my @default_targets = ( $config->val('predictprotein', 'default_targets') ? split(/ /o, $config->val('predictprotein', 'default_targets') ) : () );
# The default has to be chosen before '/' is appended: `.' binds tighter than
# `||', so `glob(...).'/' || default' could never yield the default.
my $librg_utils_perl = ( glob($config->val('predictprotein', 'librg_utils_perl')) || '/usr/share/librg-utils-perl' ).'/';
my $numresmax = $config->val('predictprotein', 'numresmax') || 6000; # Marco's suggestion, we know it now blows up for over 19000 (convert_seq I think)
my $pfam2db = glob($config->val('predictprotein', 'pfam2db'));
my $pfam3db = glob($config->val('predictprotein', 'pfam3db'));
my $prodomblastdb; # obsolete, kept here to not break interfaces that still use this cmd line option
my $prof_app_root = glob($config->val('predictprotein', 'prof'));
my $profnumresmin = $config->val('predictprotein', 'profnumresmin') || 17;
my $prositedat = glob( $config->val('predictprotein', 'prositedat' ) );
my $prositeconvdat = glob( $config->val('predictprotein', 'prositeconvdat' ) );
my $psicexe = glob( $config->val('predictprotein', 'psicexe' ) );
my $spkeyidx = glob( $config->val('predictprotein', 'spkeyidx' ) );
my $swissblastdb = glob( $config->val('predictprotein', 'swissblastdb' ) );
my $use_cache = $config->val('predictprotein', 'use_cache');
#
# $pp_rootdir always ends in '/' and is therefore always true; the fall back
# to the package data directory has to test the configured value instead.
my $root_dir = ( defined( $pp_utildir ) && length( $pp_utildir ) ) ? $pp_rootdir : "__pkgdatadir__";
my $make_file = $root_dir."/MakefilePP.mk";

#my $output_format = "text";
#my $input_file;
my $work_dir;

# COMMAND LINE OPTIONS
# Variables filled only by GetOptions() below
my( $help, $man, $version, $print_ext_method_map );
my( $dbg, $dry_run, $makedebug );
my( $prot_name, $seqfile, $sequence_in );
my( $output_dir, $setacl );
my( @method, @target );

# Options with a built-in default
my $cache_merge_cl;
my $force_cache_store = 0;
my $num_cpus = 1;
my $blast_processors = 1;

# At least one command line argument is required
die "Usage: $0 [OPTIONS]\n" unless @ARGV >= 1;

my $result = GetOptions(
  'acl|setacl=s'          => \$setacl,
  'bigblastdb=s'          => \$bigblastdb,
  'big80blastdb=s'        => \$big80blastdb,
  'blast-processors=i'    => \$blast_processors,
  'c|num-cpus=i'          => \$num_cpus,
  'cache-merge!'          => \$cache_merge_cl,
  'd|debug!'              => \$dbg,
# 'f|output-format=s'     => \$output_format,
  'force-cache-store!'    => \$force_cache_store,
  'help|?'                => \$help,
# 'i|input-file=s'        => \$input_file,
  'm|make-file=s'         => \$make_file,
  'makedebug=s'           => \$makedebug,
  'man'                   => \$man,
  'method=s@'             => \@method,
  'numresmax=i'           => \$numresmax,
  'n|dryrun'              => \$dry_run,
  'o|output-dir=s'        => \$output_dir,
  'p|prot-name=s'         => \$prot_name,
  'pfam2db=s'             => \$pfam2db,
  'pfam3db=s'             => \$pfam3db,
  'print-ext-method-map!' => \$print_ext_method_map,
  'prodomblastdb=s'       => \$prodomblastdb, # kept for compatibility
  'profnumresmin=i'       => \$profnumresmin,
  'prositedat=s'          => \$prositedat,
  'prositeconvdat=s'      => \$prositeconvdat,
  'psicexe=s'             => \$psicexe,
  'spkeyidx=s'            => \$spkeyidx,
  's|seq|sequence=s'      => \$sequence_in,
  'seqfile=s'             => \$seqfile,
  'swissblastdb=s'        => \$swissblastdb,
  'target=s@'             => \@target,
  'use-cache!'            => \$use_cache,
  'v|version'             => \$version,
  'w|work-dir=s'          => \$work_dir,
);
if( !$result ){ pod2usage(2); }

# Informational options: each one prints and ends the program
if( $help ){ pod2usage(0); }
if( $man ){ pod2usage(-verbose => 2); }
if( $version )
{
  print STDERR <<"EOT";
This is PredictProtein version __PACKAGE_VERSION__

Copyright 1992-2014, Rostlab

Please see the COPYING file for license information.

Complete documentation for PredictProtein should be found on
this system using "man predictprotein" or at http://www.predictprotein.org/.

EOT
  exit(0);
}

if( $print_ext_method_map )
{
  # one `<extension><tab><method>' line per extension, sorted by extension
  for my $ext ( sort keys %meth_ext_hash )
  {
    print "$ext\t$meth_ext_hash{$ext}\n";
  }
  exit(0);
}

# Get sequence
# With --seqfile the first record of the FASTA file (standard input for `-')
# replaces whatever was given with --sequence.
if( $seqfile )
{
  my $in = Bio::SeqIO->new( ( $seqfile eq '-' ? ( -fh => \*STDIN ) : ( -file => scalar( glob( $seqfile ) ) ) ), -format => 'Fasta' );
  my $seq = $in->next_seq();
  # no record at all (e.g. empty input): report it instead of failing on a
  # method call on an undefined value
  if( !defined( $seq ) ){ die("Error: no sequence found in '$seqfile'".( $dbg ? '' : "\n" )); }
  $sequence_in = $seq->seq();
}
$sequence_in || die("Error: no input sequence".( $dbg ? '' : "\n" ));

# Translate the --method arguments: make variables, extra make targets and
# the per-method control parameters, the latter also as `--method ...'
# options for the ppc_* cache programs.
my( $makemethod, $maketarget, $hash_meth ) = _get_makemethod( \@method );
my $hash_meth_as_options = _get_method_options( $hash_meth );

my $cache_merge = 0;
{
  # lkajan: warning: meddling with default value ahead:
  # if --use-cache and --noforce-cache-store and --target is used and the cache is not empty, make $cache_merge ON by default, OFF otherwise
  $cache_merge = ( ( $use_cache && !$force_cache_store && @target ) ? 1 : 0 );
  if( defined( $cache_merge_cl ) ){ $cache_merge = $cache_merge_cl; }

  # lkajan: if there is nothing in the cache, silently ignore a request (on the cmd line or otherwise) for cache merging:
  if( $cache_merge && !_in_cache( $sequence_in, $hash_meth_as_options ) ){ $cache_merge = 0; }
}

if($dbg){ warn("cache merging is ".( $cache_merge ? 'on' : 'off' )); }

# force_cache_store, cache_merge_cl imply use-cache
if( $force_cache_store || $cache_merge_cl ){ $use_cache = 1; }

# force_cache_store can not be used together with cache_merge
# (the message names the option as it is spelled on the command line)
if( $force_cache_store && $cache_merge ){ die("Error: --force-cache-store is incompatible with --cache-merge".( $dbg ? '' : "\n" )); }

if( !$output_dir && !$use_cache && !$work_dir ){ die("Error: no output directory or work directory given and no cache is to be used. Results would be lost unless you use --output-dir, --work-dir or --use-cache.".( $dbg ? '' : "\n" ) ); }
if( $output_dir )
{
  $output_dir = Cwd::realpath(glob($output_dir));
  system( 'mkdir', '-p', $output_dir ) && die( "could not mkdir '$output_dir': $?" );
}

if(! $prot_name){ $prot_name="query"; }
if( $prot_name =~ /[^[:alnum:]._-]/o ){ die("Error: invalid protein name, please use only [[:alnum:]._-]".( $dbg ? '' : "\n" )); }

# check acl syntax if given
if( $setacl ) { RG::PP::ACL::acl2hash( $setacl ); }

if( !@target ){ if( @default_targets ){ @target = @default_targets; } else { @target = ( 'all' ); } } # lkajan: attention: `all' does not mean all methods, there also are the `optional' ones
push @target, @$maketarget;

# there's no need to give a target more than once - collapse duplicates
# The first occurrence of each target is kept, so the order in which targets
# were requested is preserved (going through the keys of a hash would return
# them in an arbitrary order).
{
  my %seen_target;
  @target = grep { !$seen_target{$_}++ } @target;
}

# get number of processors on this host
#my @num_cpus=();
#if (!($num_cpus) && -e '/proc/cpuinfo'){
#     @num_cpus=`grep processor /proc/cpuinfo`;
#     $num_cpus = scalar(@num_cpus);
#}

my $cache_dir;
my $hashlockhdl;
my $hashlockpid;
# At program end: if a ppc_lock child was started, send it SIGTERM, close its
# pipe and wait for all children. $? is localised so that this clean up does
# not change the exit status of the program.
END {
  if( $hashlockpid )
  {
    local $?;
    kill 15, $hashlockpid;
    undef($hashlockpid);
    if( !close( $hashlockhdl ) ){ warn("failed to close ppc_lock pipe: ".( $! ? "$!" : " command exited with $?" )); }
    while(wait != -1){};
    if($dbg){ warn("no children left"); }
  }
}

if( $use_cache && !$force_cache_store )
{
  # Do we have it in the cache?
  my @ppf_cmd = _get_ppf_cmd_base( $sequence_in, $hash_meth_as_options );

  if( !$cache_merge )
  {
    # use cache, no force, no merge: fetch results to the output dir if not cache merge
    if( $output_dir ){ push @ppf_cmd, ( '--output-dir', $output_dir ); }
    if( $prot_name ){ push @ppf_cmd, ( '--prot-name', $prot_name ); }
  }
  else
  {
    # Obtain lock, keep ppc_fetch from locking
    my @ppl_cmd = ( 'ppc_lock', '--seq', $sequence_in, '--parent', $$, '--allow-cache-write', @$hash_meth_as_options );
    if( $dbg ){ cluck( "@ppl_cmd" ); }
    # open(..., '-|') forks; a failed fork is reported in $!, not in $?
    $hashlockpid = open( $hashlockhdl, '-|' ); if( !defined( $hashlockpid ) ){ die("failed to execute '|@ppl_cmd': $!"); }
    if( !$hashlockpid ){ if( !exec { $ppl_cmd[0] } @ppl_cmd ){ confess( "failed to call @ppl_cmd: $!" ); } }
    # the first line printed by ppc_lock is taken as the cache directory
    $cache_dir = <$hashlockhdl>;
    if( !defined( $cache_dir ) ){ confess("error: ppc_lock did not print a cache directory"); }
    chomp( $cache_dir );
    # lkajan 20110514: with acls -w does not report correctly
    if( !-e $cache_dir ){ confess("error: $cache_dir does not exist"); }
    #
    push @ppf_cmd, '--skip-hash-lock';
  }

  if( $dbg ){ cluck( "@ppf_cmd" ); }

  my( @cachefiles, $closeret );
  {
    my $pid = open( my $pipe, '-|' ); if( !defined( $pid ) ){ die("failed to execute '|@ppf_cmd': $!"); }
    if( !$pid ){ if( !exec { $ppf_cmd[0] } @ppf_cmd ){ confess( "failed to call @ppf_cmd: $!" ); } }
    @cachefiles = <$pipe>;
    # close() on a pipe is false when the command exited non-zero; $! is only
    # set in addition when closing itself failed
    $closeret = close( $pipe );
    if( !$closeret && $! ){ confess("failed to call @ppf_cmd: $!"); }
  }
  #
  if( !$cache_merge && $closeret )
  {
    # in cache, so we do not run. The ppc_fetch call fetched results to output-dir if we had that set, nothing left to do.
    exit(0);
  }
}

# not in cache or merge or forced recalc
# Refuse sequences that are too short or too long. $! is set immediately
# before die() to obtain the exit codes listed under ERRORS in the manual
# (254: too short, 253: too long).
{
  my $numres = length($sequence_in);
  if( $numres < $profnumresmin )
  {
    $! = 254; die("ERROR: sequence is too short, shorter than minimum length required by prof ($profnumresmin)".( $dbg ? '' : "\n" ));
  }
  if( $numres > $numresmax )
  {
    $! = 253; die("ERROR: sequence is too long, longer than maximum length ($numresmax)".( $dbg ? '' : "\n" ));
  }
}

# Where to run: directly on the cache directory when merging, otherwise in
# the requested work directory or - if none was given - in a temporary one
# (which is kept only in debug mode).
if( $cache_merge )
{
  $work_dir = $cache_dir;
}
elsif( $work_dir )
{
  $work_dir = Cwd::realpath(glob($work_dir));
  if( system( "mkdir", '-p', $work_dir ) ){ die( "could not mkdir '$work_dir': $?" ); }
}
else
{
  $work_dir = File::Temp::tempdir( CLEANUP => !$dbg );
}

# PRINT OUT JOB ENV
cluck( "work_dir=$work_dir" ) if $dbg;

# work_seq_file: FASTA file with the upper-cased query sequence inside the
# work directory; an already existing file is left untouched
my $work_seq_file = $work_dir.'/'.$prot_name.'.in';

unless( -e $work_seq_file )
{
  my $query_seq = Bio::Seq->new( -display_id => $prot_name, -seq => uc($sequence_in) );
  my $fasta_out = Bio::SeqIO->new( -format => 'Fasta', -file => ">$work_seq_file" );
  $fasta_out->write_seq( $query_seq );
}


# RUN MAKEFILE
# Fixed part of the make command line; -n and --debug= are present only when
# the respective option was given.
my @cmd = (
  "make",
  "--no-builtin-rules",
  "INFILE=$prot_name.in",
  "-C", $work_dir,
  "JOBID=".$prot_name,
  ( $dry_run ? ( "-n" ) : () ),
  "-j", $num_cpus,
  ( defined($makedebug) ? ( "--debug=$makedebug" ) : () ),
  "BLASTCORES=".$blast_processors,
  "LIBRGUTILS=".$librg_utils_perl,
  "PPROOT=".$pp_rootdir,
  "PROFNUMRESMIN=$profnumresmin",
  "PROFROOT=".$prof_app_root,
);

# resource locations: each one is passed on only if it has a value
foreach my $resource (
  [ 'BIGBLASTDB',     $bigblastdb ],
  [ 'BIG80BLASTDB',   $big80blastdb ],
  [ 'PFAM2DB',        $pfam2db ],
  [ 'PFAM3DB',        $pfam3db ],
  [ 'PROSITEDAT',     $prositedat ],
  [ 'PROSITECONVDAT', $prositeconvdat ],
  [ 'PSICEXE',        $psicexe ],
  [ 'SPKEYIDX',       $spkeyidx ],
  [ 'SWISSBLASTDB',   $swissblastdb ],
)
{
  my( $make_var, $location ) = @$resource;
  if( $location ){ push @cmd, "$make_var=$location"; }
}

# method control params
push @cmd, @$makemethod;
push @cmd, ( $dbg ? "DEBUG=".$dbg : '--quiet' );
if( $make_file ){ push @cmd, "-f", Cwd::realpath(scalar(glob($make_file))); }

# list targets
my @cmd_all = ( @cmd, @target );

my $oldout;
if( $dbg )
{
  cluck("@cmd_all");
}
else
{
  # so the methods speak on standard out ... let's just silence them all:
  # keep a duplicate of STDOUT and point STDOUT to /dev/null for the make run
  open( $oldout, '>&', \*STDOUT ) || die( $! );
  open( STDOUT, '>', '/dev/null' ) || die( $! );
}

if( system(@cmd_all) ){ die( "@cmd_all failed: $?" ); }

# bring the original STDOUT back
if( !$dbg ){ open( STDOUT, '>&', $oldout ) || die( $! ); }

# copy to output dir?
# We copy results to the output directory /before/ storing.
# The copy is done by the `install' target of the make file, called with the
# same arguments as the main run plus DESTDIR.
if( $output_dir )
{
  my @cmd_install = ( @cmd, "DESTDIR=$output_dir", 'install' );

  if ($dbg) { cluck("@cmd_install"); }
  # @cmd_install already ends in `DESTDIR=... install', do not repeat it in the message
  system( @cmd_install ) && die( "@cmd_install failed: $?" );

  if( $dbg ){ warn( "results were copied into $output_dir" ); }
}

# cache store?
if( $use_cache )
{
  # We have to combine method ctrl params (%$hash_meth) with result files here and that is not trivial. We are going to have logic here to establish which result file belongs to which method.
  # Associate extensions with method names: %meth_ext_hash
  # lkajan 20110514: there is no need for a precache_dir and an install into it since PP now leaves the working directory in pristine condition DOESNT'T IT?!
  my $precache_dir = $work_dir;
  if( $dbg ){ warn("precache dir = $precache_dir"); }

  # result files of this run are named `<prot_name>.<extension>'
  my $name_prefix = "$prot_name.";

  my $result_files = _ls_dir( $precache_dir );
  foreach my $res_file (@$result_files)
  {
    # skips `.', `..' and hidden files
    if( substr( $res_file, 0, 1 ) eq '.' ){ next; }

    # lkajan 20100415: certain result files we do not want to store into the cache in order to save space
    # lkajan: these are result files that are big, derived from other files kept and not needed for the web interface (or anything of importance) so regeneration time is not crucial
    # lkajan: at present these are: safBlastPsi safBlastPsi80 hssp
    # lkajan: When cache_merge is in use, these files need to be actively removed from the precache_dir since in this case it is /the/ cache dir

    # lkajan 20111018: pending new policy for files stored in cache:
    # - we keep files directly shown on the web interface
    # - we keep the immediate input files of (top level) predictors so that
    #   they can be re-run, files such as .hsspPsiFil and .blastpSwissM8
    # - we do not keep ancestors of these files, such as .blastPsiOutTmp and
    #   .hssp80

    # query.blastpSwissM8 => $method->{blastpswiss}->{res}->{blastpSwissM8} = "$precache_dir/$res_file";
    # Also allow file names like: 'query.arch.lc2', $extkey should here be 'arch.lc2'.
    my $extkey;
    if( index( $res_file, $name_prefix ) == 0 )
    {
      # The protein name may itself contain dots (--prot-name allows them), so
      # the extension is what follows `<prot_name>.', not what follows the
      # first dot of the file name.
      $extkey = substr( $res_file, length( $name_prefix ) );
    }
    else
    {
      # file of some other base name: everything after its first dot
      my( $filename, $directories, $suffix ) = fileparse( $res_file, qr/\..*$/o ); # lkajan: we want the longest possible match from the right
      if( length( $suffix ) ){ $extkey = substr( $suffix, 1 ); }
    }
    # Without an extension there is nothing to look up or to name the result with
    if( !defined( $extkey ) || !length( $extkey ) ){ warn("Warning: '$res_file' has no extension, it is not stored"); next; }

    my $method = $meth_ext_hash{$extkey};
    if( !$method ){ warn("Warning: extension '$extkey' does not have associated method"); $method = $extkey; }

    $hash_meth->{$method}->{res}->{$extkey} = "$precache_dir/$res_file";
  }
  #if( $dbg ){ cluck( Dumper( $hash_meth ) ); }

  # Now call ppc_store
  # One `--method=NAME[,ctrl=value...][,res_EXT=file...]' argument per method
  my @method_args = map {
    my $method = $_;
    my $ctrl = join(',', map { my $cp = $_; "$cp=$hash_meth->{$method}->{ctrl}->{$cp}"; } keys(%{$hash_meth->{$method}->{ctrl}}) );
    my $res = join(',', map { my $res = $_; "res_$res=$hash_meth->{$method}->{res}->{$res}"; } keys(%{$hash_meth->{$method}->{res}}) );
    "--method=$method".( $ctrl ? ",$ctrl" : '' ).($res ? ",$res" : '' );
  } keys(%$hash_meth);

  my @cmd = (
    'ppc_store',
    ( $setacl ? ( '--setacl', $setacl ) : () ),
    ( $cache_merge ? ( '--merge-res', '--skip-hash-lock' ) :  () ),
    @method_args,
    '--seq', $sequence_in
  );

  if( $dbg ){ cluck( "@cmd" ); }
  system( @cmd ) && die( "@cmd failed: $?" );
}

exit(0);



# lkajan: method - translate and pass --method=norsp,win=50,... --method=... method control parameters to make file
# lkajan: idea: translate control params simply like this: `--method=MNAME,CTRLP1=CTRLVAL1,CTRLP2=CTRLVAL2' -> `MNAMECTRL="--CTRLP1=CTRLVAL1 --CTRLP2=CTRLVAL2"'
# lkajan: Also return all methods as targets.
#
# Argument: reference to the array of --method strings.
# Returns a list of three references:
#   - array of `<METHOD>CTRL="..."' make variable assignments, one per --method
#   - array of the method names (used as additional make targets)
#   - hash: method name => { ctrl => { parameter => value } }; a parameter
#     given without `=value' is stored with the empty string as value
# `res_'-type parameters are ignored with a warning, empty parameters (as in
# `a,,b') are dropped. Dies when a method name is missing or when control
# parameters are given for prof, profbval or norsnet; confesses when a control
# parameter starts with `-'.
sub               _get_makemethod
{
  my( $__method ) = @_;
  my $makemethod = [];
  my $maketarget = [];
  my $hash_meth = {};

  foreach my $method_cm ( @$__method )
  {
    # split each string in two parts: method name and the rest
    my ($meth1, $rest)= split /,/, $method_cm, 2;

    if( !defined( $meth1 ) || !length( $meth1 ) ){ die("Error: --method needs a method name, got '$method_cm'".( $dbg ? '' : "\n" )); }

    push @$maketarget, $meth1;

    # split the rest into the individual parameters; an empty parameter would
    # end up as a bare `--' on the command line of the method
    my @tmp_hash; if( $rest ){ @tmp_hash = grep { length( $_ ) } split /,/, $rest; }

    # look for the ctrl and res elements for each method
    my @tmp_crl = grep( !/^res_/, @tmp_hash );
    my @tmp_res = grep( /^res_/, @tmp_hash );

    if( @tmp_res ){ warn( "Warning: `res_'-type parameters were given ('@tmp_res') - these are ignored by this program\n" ); }

    if( @tmp_crl )
    {
      # Certain methods like profbval do not accept Getopt::Long-style cmd line parameters. That's bad. We can not pass parameters to such methods because we do not want to program special cases for them.
      # The long term solution is to make these methods accept Getopt::Long params.
      if(
        $meth1 eq 'prof' ||
        $meth1 eq 'profbval' ||
        $meth1 eq 'norsnet'
      ){ die("Error: this interface does not support passing control parameters to $meth1 because of ${meth1}'s primitive command line interface.".( $dbg ? '' : "\n" )); }
    }

    # make variable: every control parameter becomes a `--name=value' option
    push @$makemethod, uc( $meth1 ).'CTRL="'.join(' ', map {
      my $ctrlpair = $_;
      if( $ctrlpair =~ /^-/ ){ confess( "Error: control parameter with leading `-': '$ctrlpair'" ); } "--$ctrlpair";
    } @tmp_crl ).'"';

    # Build the parameter hash pair by pair: flattening the split() results
    # into one list shifts all following keys and values as soon as one
    # parameter has no `='.
    my $ctrl = {};
    foreach my $ctrlpair ( @tmp_crl )
    {
      my( $name, $value ) = split( /=/, $ctrlpair, 2 );
      $ctrl->{$name} = defined( $value ) ? $value : '';
    }
    $hash_meth->{$meth1}{'ctrl'} = $ctrl;
  }

  return( $makemethod, $maketarget, $hash_meth );
}


# Return a reference to an array with the names of all entries of the given
# directory, as delivered by readdir() (`.' and `..' included, no path
# prepended). Confesses when the directory can not be opened.
sub               _ls_dir
{
  my( $__dir ) = @_;

  my $dirhandle;
  if( !opendir( $dirhandle, $__dir ) ){ confess( "failed to open $__dir: $!" ); }
  my @entries = readdir( $dirhandle );
  closedir( $dirhandle );

  return \@entries;
}


# Turn the method hash ( method => { ctrl => { parameter => value } } ) into
# command line options: for every method the two elements `--method' and
# `NAME[,parameter=value]...'. Returns a reference to a new array; the order
# of methods and parameters is the order keys() delivers them in.
sub               _get_method_options
{
  my( $__hash_meth ) = @_;

  my @options;
  foreach my $name ( keys( %$__hash_meth ) )
  {
    my @pairs;
    foreach my $param ( keys( %{ $__hash_meth->{$name}->{ctrl} } ) )
    {
      push @pairs, "$param=$__hash_meth->{$name}->{ctrl}->{$param}";
    }

    my $spec = $name;
    if( @pairs ){ $spec .= ','.join( ',', @pairs ); }

    push @options, '--method', $spec;
  }

  return \@options;
}


# Tell whether `ppc_fetch --seq ... --method ... --print-dir' succeeds for the
# given sequence and method options.
# Arguments: sequence string, reference to the array of `--method ...' options.
# Returns the result of close() on the pipe to ppc_fetch: true when the
# command exited with status 0, false otherwise. The output of the command is
# read and discarded. Dies when the fork fails, confesses when the command can
# not be executed or the pipe can not be closed.
sub               _in_cache
{
  # lkajan: this is a very quick check, it should not do anything else but tell if the cache slot is empty or not
  # lkajan: warning: race condition here: whatever this call finds may not be true by the time ppc_lock or another fetch is executed
  my( $__seq, $__hash_meth_as_options ) = @_;

  my @ppf_cmd = _get_ppf_cmd_base( $__seq, $__hash_meth_as_options ); # ppc_fetch --seq ... --method ... --method ...
  push @ppf_cmd, "--print-dir";

  if( $dbg ){ cluck( "@ppf_cmd" ); }

  # open(..., '-|') forks; a failed fork is reported in $!, not in $?
  my $pid = open( my $pipe, '-|' ); if( !defined( $pid ) ){ die("failed to execute '|@ppf_cmd': $!"); }
  # child: replace this process with ppc_fetch
  if( !$pid ){ if( !exec { $ppf_cmd[0] } @ppf_cmd ){ confess( "failed to call @ppf_cmd: $!" ); } }

  # only the exit status matters: read the output to the end and drop it
  while( defined( my $line = <$pipe> ) ){}

  # close() on a pipe is false when the command exited non-zero; $! is only
  # set in addition when closing itself failed
  my $closeret = close( $pipe );
  if( !$closeret && $! ){ confess("failed to call @ppf_cmd: $!"); }

  return $closeret;
}


# Build the common start of a ppc_fetch command line.
# Arguments: sequence string, reference to the array of `--method ...' options.
# Returns the list ( 'ppc_fetch', '--seq', <sequence>, <method options> ).
sub               _get_ppf_cmd_base
{
  my( $__seq, $__hash_meth_as_options ) = @_;

  # use the sequence that was passed in, not the file-level $sequence_in
  my @ret = ( 'ppc_fetch', '--seq', $__seq, @$__hash_meth_as_options );

  return @ret;
}


__END__

=head1 NAME

predictprotein - analyse protein sequence

=head1 SYNOPSIS

predictprotein [--blast-processors] [--num-cpus|c] [--debug|d] [--help] [--make-file|m] [--makedebug] [--man] [--method] [--dryrun|n] [--numresmax] [--output-dir|o] [--print-ext-method-map] [--profnumresmin] [--psicexe] [--prot-name|p] [--sequence|seq|s] [--seqfile] [--spkeyidx] [--target]* [--version|v] [--work-dir|w]

predictprotein [--bigblastdb] [--big80blastdb] [--pfam2db] [--pfam3db] [--prodomblastdb] [--prositedat] [--prositeconvdat] [--swissblastdb]

predictprotein [--setacl|acl] [--<no>cache-merge] [--<no>force-cache-store] [--<no>use-cache]

=head1 DESCRIPTION

predictprotein runs a set of protein sequence analysis methods:

=head2 Standard methods

These methods are run by the default target 'all':

 Feature                Target            Extension               Man page
 -------                ------            ---------               --------
 atom mobility          profbval          profbval, profb4snap    profbval(1)
 bacterial transmem-    proftmb           proftmb, proftmbdat     proftmb(1)
  brane beta barrels
 coiled-coils           coiledcoils       coils, coils_raw        coils-wrap(1)
                                                                  ncoils(1)
 disulfide bridges      disulfinder       disulfinder             disulfinder(1)
 Gene Ontology terms    metastudent       metastudent.BPO.txt,    metastudent(1)
                                          metastudent.CCO.txt,
                                          metastudent.MFO.txt
 local alignment        blast             blastPsiOutTmp, chk,    blastpgp(1)
                                          blastPsiMat,
                                          blastPsiAli,
                                          blastpSwissM8           blastall(1)
 local complexity       ncbi-seg          segNorm, segNormGCG     ncbi-seg(1)
 non-regular secondary  norsp             nors, sumNors           norsp(1)
  structure
 nuclear localization   predictnls        nls, nlsDat, nlsSum     predictnls(1)
 Pfam scan hmmer v2     hmm2pfam          hmm2pfam                hmm2pfam(1)
 Pfam scan hmmer v3     hmm3pfam          hmm3pfam, hmm3pfamTbl,  hmmscan(1)
                                          hmm3pfamDomTbl
 PROSITE scan           prosite           prosite                 prosite_scan(1)
 protein-protein        profisis          isis                    profisis(1)
  interaction sites
 secondary structure,   prof              profRdb                 prof(1)
  accessibility from
  sequence profile
 secondary structure,   prof              prof1Rdb                prof(1)
  accessibility from
  single sequence
 secondary structure,   reprof            reprof                  reprof(1)
  accessibility from
  single sequence
 transmembrane          phd               phdPred, phdRdb         prof(1)
  helices
 unstructured loops     norsnet           norsnet                 norsnet(1)

=head2 Optional methods

These methods are non-redistributable or depend on non-redistributable software (indicated by '*').  You have to acquire the non-redistributable components yourself before you can use these methods.

These methods are run by the target 'optional'.

 Feature                Target            Extension               Man page
 -------                ------            ---------               --------
 disordered regions     metadisorder      mdisorder               metadisorder(1)
 subcellular            loctree3          {arch,bact,euka}.lc3    loctree3(1)
                        tmhmm*            tmhmm                   n.a.
 protein-RNA,           somena            somena                  somena(1)
 protein-DNA
  interaction sites
 DNA- , RNA- and        prona             prona                   prona2019(1)
  Protein-binding
  protein and binding
  sites
 position-specific      psic*             psic, clustalngz        psic(1),
  independent counts                                              runNewPSIC(1),
  and its base multi-                                             clustalw(1)
  ple alignment
 transmembrane helices  tmhmm             tmhmm                   n.a.
                        tmseg             tmseg                   tmseg(1)
 functional regions     consurf           _consurf.grades         consurf(1)

=head2 Resources

 Database                             Cmd line argument
 --------                             -----------------
 big (Uniprot+PDB) blast database     --bigblastdb
 big_80 (big @ 80% sequence identity  --big80blastdb
   redundancy level) blast database
 swiss blast database                 --swissblastdb
 pfam v2 database                     --pfam2db
 pfam v3 database                     --pfam3db
 prosite_convert.dat                  --prositeconvdat

=head3 Resources for optional targets

 Database                             Cmd line argument
 --------                             -----------------
 big (Uniprot+PDB) blast database     --bigblastdb
 prosite.dat                          --prositedat
 Swiss-Prot keyword-to-accession      --spkeyidx
  'index' for loctree

=head2 Generating Resources

Courtesy of Wiktor Jurkowski:

 * rostlab-data-prosite_convert prosite.dat prosite_convert.dat
 * perl /usr/share/loctree/perl/keyindex4loctree.pl < keyindex.txt > keyindex_loctree.txt
 * hmmpress Pfam-A.hmm

=head2 Output format

Method outputs are deposited into B<--output-dir>.  Each method has one or more file name extensions associated with it, see the table above.  Refer to the man page of the individual methods for further details.  Extensions ending with `gz' are compressed with gzip(1).

=head1 REFERENCES

=over

=item Rost, B., Yachdav, G., and Liu, J. (2004). The PredictProtein server. Nucleic Acids Res, 32(Web Server issue), W321-6.

=back

In case you find predictprotein and the tools within useful please cite:

* the references for PredictProtein, see above

* the references for the tools you used, see REFERENCES on the man page of the tool

=head1 OPTIONS

=over

=item B<--blast-processors>

Number of processors blast may use, default = 1

=item B<-c>, B<--num-cpus>

Number of parallel make jobs (passed to make(1) as B<-j>), default = 1

=item B<-d>, B<--debug>

=item B<--help>

Print a brief help message and exit.

=item B<-m>, B<--make-file>

make file to use, default = __pkgdatadir__/MakefilePP.mk

=item B<--makedebug>

debug argument for make, see make(1)

=item B<--man>

This documentation page

=item B<--method>

Describes method control parameters and requests methods to run when B<--target> is not I<all>. Format example:
 
 --method=norsp,win=50
 
* begin with the method name, e.g. `norsp'

* list method control parameters, e.g. win=50

Not all methods support passing control parameters in this way due to their primitive command line interfaces.

=item B<-n>, B<--dryrun>

Do not execute, just show what is about to be run

=item B<--numresmax>

Maximum sequence length, default: I<6000>. Sequences longer than this will make predictprotein fail with the respective error code, see L<ERRORS>.

=item B<-o>, B<--output-dir>

Final location of output files, required unless caching is used.

=item B<--print-ext-method-map>

Print extension-to-method map.  Useful as input file for consistency checkers.  Format: <extension><tab><method>.

=item B<--profnumresmin>

Minimum sequence length required by prof, default: I<17>. Sequences shorter than this will make predictprotein fail with the respective error code, see L<ERRORS>.

=item B<--psicexe>

psic wrapper executable, default: /usr/share/rost-runpsic/runNewPSIC.pl

=item B<-p>, B<--prot-name>

Base name of result files and protein name in - for example - FASTA files. Default = `query'.

Valid names are of the character set C<[[:alnum:]._-]>.

=item B<-s>, B<--seq>, B<--sequence>

one letter amino acid sequence input

=item B<--seqfile>

FASTA amino acid sequence file; if `-', standard input is read

=item B<--spkeyidx>

Swiss-Prot keyword-to-identifier 'index' file for loctree(1).

=item B<--target>=I<string>

Method groups to run.  Give this argument for each target you need.  Default: the value of `default_targets' in the configuration file; `all' if that is not given.

Some targets of interest:

=over

=item I<all>

methods that are GPL or redistributable to non-commercial entities

=item I<optional>

methods that do not fit into I<all>

=back

Look at __pkgdatadir__/MakefilePP.mk for a list of targets ("Use the source Luke").

=item B<-v>, B<--version>

Print package version

=item B<-w>, B<--work-dir>

Working directory, optional

=back

=head2 Database options

=over

=item B<--bigblastdb>

Path to comprehensive blast database

=item B<--big80blastdb>

Path to comprehensive blast database at 80% sequence identity redundancy level

=item B<--pfam2db>

Pfam v2 database, e.g. F<Pfam_ls>

=item B<--pfam3db>

Pfam v3 database, e.g. F<Pfam-A.hmm>

=item B<--prodomblastdb>

Obsolete.  This argument is kept only to maintain compatibility with older versions.

=item B<--prositedat>

Path to `prosite.dat' file, see L<https://rostlab.org/owiki/index.php/Packages#Resource_definitions>

=item B<--prositeconvdat>

Path to `prosite_convert.dat' file, see L<https://rostlab.org/owiki/index.php/Packages#Resource_definitions>

=item B<--swissblastdb>

Path to SwissProt blast database

=back

=head2 Cache related options

=over

=item B<--acl>, B<--setacl>

Set access control lists.  Access control lists are set I<only> in case results are stored in the cache.  This option is ineffective otherwise.
All previous ACLs are lost - no merging.  The read bit controls browsability of results. Other bits are not used. E.g.

 u:lkajan:4,u:gyachdav:4,g:lkajan:4,o::0

=item B<--cache-merge>

=item B<--nocache-merge>

Merge/do not merge results into cache.  B<--cache-merge> reuses results already in cache; this turns B<--use-cache> on automatically.  B<--cache-merge> is incompatible with B<--force-cache-store>.

B<--nocache-merge> is the default UNLESS

=over

=item * B<--use-cache> is on and

=item * B<--noforce-cache-store> is in effect and

=item * B<--target> is used and

=item * the cache is not empty

=back

B<--cache-merge> is silently ignored in case the cache is empty.

=item B<--force-cache-store>

=item B<--noforce-cache-store>

Enable/disable forcing storage of results into cache.  Implies B<--use-cache>.  Default: B<--noforce-cache-store>

With B<--noforce-cache-store> when predictprotein finds cached results it simply fetches them from the cache and does no processing (even if the results are incomplete).  With B<--force-cache-store> predictprotein does not fetch anything from the cache but does store the results, completely replacing what was cached.

B<--force-cache-store> is incompatible with B<--cache-merge>.

=item B<--use-cache>

=item B<--nouse-cache>

Use/do not use cache for predictprotein results.  Default: B<--nouse-cache>.

Option `use_cache' may be given in configuration files to override default.

=back

=head1 ERRORS

=over

=item I<253>

Sequence is too long, see B<--numresmax>

=item I<254>

Sequence is too short, shorter than minimum length required by prof. See B<--profnumresmin>.

=back

=head1 EXAMPLES

 predictprotein --seqfile __docdir__/examples/tquick.fasta --output-dir /tmp/pp 

 predictprotein --seqfile __docdir__/examples/tquick.fasta --output-dir /tmp/pp --target query.profRdb --target loctree3

 predictprotein --seqfile __docdir__/examples/tquick.fasta --method=norsp,win=100 --output-dir /tmp/pp 

=head2 Cache examples

=over

=item Store results in cache, do not care about storing files in B<--output-dir>:

 predictprotein --seqfile __docdir__/examples/tquick.fasta --method=norsp,win=100 --use-cache --setacl g:rostlab:7

=item Store results in cache if they are not cached yet, otherwise fetch results from cache into B<--output-dir>:

 predictprotein --seqfile __docdir__/examples/tquick.fasta --method=norsp,win=100 --use-cache --setacl g:rostlab:7 --output-dir /tmp/pp

=back

=head1 ENVIRONMENT

=over

=item PREDICTPROTEINCONF

Location of predictproteinrc configuration file to use, overriding other configuration files

=back

=head1 FILES

=over

=item F<__pkgdatadir__/predictproteinrc.default>

Default configuration file. See this file for a description of the parameters.

=item F<__sysconfdir__/predictproteinrc>

System configuration file overriding values in F<__pkgdatadir__/predictproteinrc.default>

=item F<~/.predictproteinrc>

User configuration file overriding values in F<__sysconfdir__/predictproteinrc>

=back

=head1 NOTES

=head2 Popularity Contest

The pp-popularity-contest package included with this image sets up a cron job
that will periodically anonymously submit to the Rost Lab developers
statistics about the most used Rost Lab packages on this system.

This information helps us making decisions such as which packages
should receive high priority when fixing bugs.
It also helps us decide which packages should receive funding for further
development and support.
This information is also very important when the Rost Lab applies for funding.

Without the funding we receive based on the usage statistics you volunteer
none of the packages on this image could be made available to you at no cost.

In case you do not wish to participate in the popularity contest please
remove the pp-popularity-contest package.

=head1 AUTHOR

Burkhard Rost, Antoine de Daruvar, Jinfeng Liu, Guy Yachdav, Laszlo Kajan

=head1 SEE ALSO

ppc_store(1), ppc_fetch(1), ppqsub(1)

=cut

# vim:et:ts=2:ai:
